/* Create our baseline datasets from Joe's complete_data */
* ---- Session setup ----
clear *
capture cd "Create data"        // any error from cd is suppressed
set memory 1000m
set more off
set matsize 1000
set maxvar 3000

* ---- Load the player-game file and drop the 2004 rows ----
clear *
use "complete_data5"
drop if year==2004
generate dnp = 1 if min==0      // 1 when min is 0, missing otherwise
egen pid = group(player_id)

* ref_w0 to ref_w3: 1 when white equals the index, 0 for any other
* non-missing value of white, missing when white is missing.
forvalues k = 0/3 {
	generate ref_w`k' = 1 if white==`k'
	replace ref_w`k' = 0 if white~=`k' & white~=.
}

* NOTE(review): dnp is only ever 1 or missing, so dnp==0 is never true and
* these replaces change nothing as written. Confirm the intended condition.
foreach v in blocks turnover {
	replace `v' = . if dnp==0
}

* Missed free throws, and the two-point / three-point split of field goals
generate ftmiss  = fta - ftm
generate fg2a    = fga - threept_att
generate fg2m    = fgm - threept_made
generate fg2miss = fg2a - fg2m
generate fg3a    = threept_att
generate fg3m    = threept_made
generate fg3miss = fg3a - fg3m
generate fgmiss  = fg2miss + fg3miss

* Bring the box-score columns to the front of the dataset
order min fouls points fta ftm fga fgm rbo rbt assists steals blocks ///
	technical flagrant threept_made threept_att turnover

* Fix known data problems
* (order matters: home_team is filled from the corrected team code)
replace team = "atl" if team=="at"
replace home_team = team if home==1
replace home_team = "phi" if team=="cha" & edate==mdy(12,1,2000)
replace televised = 0 if year>=1997 & televised==.


* Sample selection: sample is 1 for usable player-games, missing otherwise
generate sample = 1

* Drop from the sample any row with a missing value in one of these variables.
* Original author notes: white missing means missing ref race or only 2 refs;
* the black exclusion needs care in the team-level analysis.
foreach v in white black home attend out_cont height weight {
	replace sample = . if `v'==.
}

* Players who were injured or stayed on the bench
replace sample = . if min==0 | dnp==1

* Generate relevant identifiers

* Largest referee id across the three referee slots, kept in global refs
egen maxrefid = rmax(ref1_id ref2_id ref3_id)
summarize maxrefid
global refs = r(max)
drop maxrefid

* One byte dummy per possible referee id: 1 when that id fills any slot
forvalues r = 1/$refs {
	quietly generate byte rid`r' = (ref1_id==`r') | (ref2_id==`r') | (ref3_id==`r')
}

* Drop the dummies whose mean is zero (referee ids that never appear)
ds rid*
foreach idvar of varlist `r(varlist)' {
	capture summarize `idvar'
	if r(mean)==0 drop `idvar'
}

* Group identifiers for players, player-seasons, games, team-games, team-seasons
egen player     = group(pid)
egen playeryear = group(player year)
capture drop gameid
egen gameid     = group(edate home_team)
egen teamgame   = group(gameid team)
egen tm_yr      = group(team year)

* String label for player race; empty when black is neither 1 nor 0
generate str10 blackplayer = cond(black==1, "Black", cond(black==0, "White", ""))


* Generate coefficient of interest
* fracwhite is white divided by 3; bl_fw interacts it with the player's black flag
capture drop fracwhite
generate fracwhite = white/3
generate bl_fw = black*fracwhite

* Generate dependent variables
generate rbd = rbt - rbo
* Composite box-score index stored in berri
generate berri = points+rbo+rbd+steals+0.5*blocks+0.5*assists-fga-0.5*fta-turnover-0.5*fouls

* Per-48-minute rates for every counting stat
local counting fouls points fta ftm ftmiss fga fgm fgmiss fg2m fg2a fg2miss ///
	fg3m fg3a fg3miss rbo rbd rbt assists steals blocks technical flagrant ///
	threept_made threept_att turnover berri
foreach s of local counting {
	generate `s'_rate = 48*`s'/min
}

* Shooting percentages (missing when the denominator is zero or missing)
generate ftper  = ftm/fta
generate fgper  = fgm/fga
generate efgper = (fgm+0.5*fg3m)/fga
generate tpper  = threept_made/threept_att
generate fg2per = fg2m/fg2a
generate fg3per = fg3m/fg3a

* Rebound totals for the player's own team (us) and for the whole game (all)
foreach s in rbo rbd {
	egen `s'us  = sum(`s'), by(gameid team)
	egen `s'all = sum(`s'), by(gameid)
}

* Rebound opportunities: own rebounds of one kind plus the other side's
* rebounds of the opposite kind
generate rbo_opportunities = rbous+rbdall-rbdus
generate rbd_opportunities = rbdus+rboall-rbous
generate rbt_opportunities = rbo_opportunities+rbd_opportunities

* Rebound rate per minutes-scaled opportunity, plus the matching weight
foreach s in rbo rbd rbt {
	generate `s'per    = `s'/(`s'_opportunities*min/48)
	generate `s'per_wt = `s'_opportunities*min/48
}

* Foul indicators.
* fouledout is 1 when fouls is at least 6 and 0 otherwise; fouls0 to fouls6
* are the same indicator at each threshold from 0 through 6.
* Every indicator is left missing when fouls is missing. Stata orders missing
* above all numbers, so an unguarded fouls>=6 would evaluate to 1 on those
* rows and be counted as a foul-out when the team-level sums are taken.
gen fouledout=fouls>=6 if fouls~=.
forvalues k = 0/6 {
	gen fouls`k'=fouls>=`k' if fouls~=.
}

* Generate control variables
replace out_cont = 0 if year==1998      // strike year
generate age = (edate - birthdate)/365  // day difference scaled by 365
generate exp = year - firstseason + 1

* Career totals per player_id, summed over every row in the file
foreach v of varlist min assists blocks rbd rbo steals turnover fta ftm fg2a fg2m fg3a fg3m fouls {
	egen `v'_career = sum(`v'), by(player_id)
}
generate p_min = min_career

* Generate "style" variables: career totals expressed per 48 minutes
foreach v of varlist assists blocks rbd rbo steals turnover fta fg2a fg3a fouls {
	generate p_`v' = 48*`v'_career/min_career
}

* Career shooting percentages. Players with no career attempts are flagged in
* p_zero<stat>per and given the minutes-weighted mean of the other rows.
foreach s in fg2 fg3 ft {
	generate p_`s'per = `s'm_career/`s'a_career
	replace p_`s'per = -1 if `s'a_career==. | `s'a_career==0
	generate p_zero`s'per = (p_`s'per==-1)
	summarize p_`s'per [aw=min] if p_`s'per>=0
	replace p_`s'per = r(mean) if p_`s'per==-1
}

* Interactions of each control with fracwhite. The four globals are not
* defined in this file and must be set by the caller before this runs.
for X in var $player_controls $playergame_controls $teamgame_controls $career_stats: gen xn_X=X*fracwhite

* Format variables (display formats only; stored values are unchanged)
format fouls points technical flagrant foreign *_rate %9.3f
format white ref_* %9.3f
format televised out_cont coach_black starter %9.3f
format min    %9.2f
format attend %9.0f
format age    %9.2f
format height %9.2f
format weight %9.1f
format exp all_star center forward guard %9.3f

* Rescale attend by a factor of 1000
replace attend = attend/1000

* Row id taken from the current observation order, then sort and save
generate playergame = _n
sort team edate
compress
save individual_data, replace


*** Now create a team-level dataset
use individual_data, clear

* Crunch down to a team*game-based dataset
* Minutes and starter flags split by the black indicator
generate blackmin     = (black==1)*min
generate whitemin     = (black==0)*min
generate totalmin     = min
generate blackstarter = (black==1)*starter
generate whitestarter = (black==0)*starter
generate starters     = starter

* Blank the foul-threshold indicators for rows with no minutes played
forvalues k = 0/6 {
	replace fouls`k' = . if min==0 | dnp==1
}

* Sum the player stats and average the game-level fields within team-game
collapse ///
	(sum) sample min blackmin whitemin blackstarter whitestarter starters ///
		berri fouls points fta ftm ftmiss fga fgm fgmiss ///
		fg2a fg2m fg2miss fg3a fg3m fg3miss rbo rbd rbt ///
		assists steals blocks technical flagrant ///
		threept_made threept_att turnover fouledout fouls0-fouls6 ///
	(mean) out_cont attend home home_attend score year coach_black coachid ///
		white fracwhite ref1_id ref2_id ref3_id rid* gameid teamgame tm_yr ///
		rbo_opportunities rbd_opportunities rbt_opportunities televised, ///
	by(team home_team edate)

* Rounded count of 25-minute increments of team minutes beyond 240, floored
* at zero (presumably the number of overtime periods; confirm)
generate ot = max(round((min-240)/25,1),0)

* Label each collapsed variable with its own name
foreach v of varlist min-fouls6 out_cont-ref3_id {
	quietly label variable `v' "`v'"
}

* Team-level shooting percentages
foreach s in ft fg fg2 fg3 {
	generate `s'per = `s'm/`s'a
}

* Team-level rebound rates
foreach s in rbo rbd {
	generate `s'per = `s'/`s'_opportunities
}
generate rbtper = rbt/(rbo_opportunities+rbd_opportunities)

* Get opponent characteristics
* Rows are ordered by home within each game. The opponent value is read from
* the previous row of the game, and from the next row when that is empty.
* The sort is done once: nothing below changes gameid or home.
sort gameid home

by gameid: generate opp_team = team[_n-1]
by gameid: replace opp_team = team[_n+1] if opp_team==""

by gameid: generate opp_score = score[_n-1]
by gameid: replace opp_score = score[_n+1] if opp_score==.
generate diff_score = score-opp_score
* NOTE(review): score>opp_score is 1 when score is missing, because missing
* compares above every number. Rows with a missing score are removed from the
* sample further down, but win itself is not set to missing for them.
generate win = score>opp_score
generate margin = score-opp_score

* Opponent value (opp_) and own-minus-opponent gap (diff_) for each team stat.
* diff_berri is in the list on purpose right after berri: it is created by
* the berri pass and then gets its own opp_ and diff_ versions.
local teamstats sample home ot min blackmin whitemin blackstarter whitestarter starters ///
	berri diff_berri fouls points win margin ///
	fta ftm ftmiss fga fgm fgmiss fg2a fg2m fg2miss fg3a fg3m fg3miss ///
	rbo rbd rbt rboper rbdper rbtper assists steals blocks ///
	technical flagrant turnover fouledout ///
	fouls0 fouls1 fouls2 fouls3 fouls4 fouls5 fouls6 ///
	out_cont coach_black coachid fgper fg2per fg3per ftper
foreach s of local teamstats {
	by gameid: generate opp_`s' = `s'[_n-1]
	by gameid: replace opp_`s' = `s'[_n+1] if opp_`s'==.
	generate diff_`s' = `s'-opp_`s'
}

* Team-level sample flag. After the collapse, sample holds the sum of the
* player-level flags for the team-game; the steps below turn it back into a
* 1-or-missing indicator. The statements are order-dependent.
* Cap the summed flag at 1.
replace sample=1 if sample>1
* Exclude the team-game when the opponent row has no sample flag.
replace sample=. if opp_sample==. | opp_sample==0
* min_sum_min is the smaller of the team minute totals within the game.
egen min_sum_min=min(min), by(gameid)
* NOTE(review): this sets sample back to 1 for every row in a game whose
* smallest team minute total is at least 200, including rows cleared by the
* opp_sample rule just above. A missing min_sum_min also satisfies >=200,
* because missing compares above every number. Confirm this is intended.
replace sample=1 if min_sum_min>=200
* Exclude games where summed player points disagree with the recorded score,
* where either score is missing, or where the two teams' minute totals differ.
replace sample=. if points~=score | opp_points~=opp_score | score+opp_score==. | min~=opp_min
replace sample=. if sample==0

* Generate team-level variables
* blackper is the share of black-plus-white minutes that are black minutes
gen blackper=blackmin/(blackmin+whitemin)
gen opp_blackper=opp_blackmin/(opp_blackmin+opp_whitemin)
gen diff_blackper=blackper-opp_blackper

* Exclude rows with a missing value in any listed variable. The two globals
* are not defined in this file and must be set by the caller.
for ! in varlist blackper fracwhite $teamgame_controls $oppgame_controls: replace sample=. if !==.

* Interact each control named in the globals with fracwhite
for any $teamgame_controls $oppgame_controls: gen xn_X=X*fracwhite
* Team-level interaction terms: own, opponent, and the gap between them
gen bl_fw=blackper*fracwhite
gen opp_bl_fw=opp_blackper*fracwhite
gen diff_bl_fw=bl_fw-opp_bl_fw

/* Merge in betting odds
use team_data, clear
sort home_team edate
merge home_team edate using "betting_odds"
drop if _merge==2
gen spread=-odds1 if home==1
replace spread=odds1 if home==0
gen vspread=margin+spread
drop odds1 odds2 result1 result2 season2 
rename _merge _merge_betting
save team_data, replace
*/
* Shrink storage types, order by team and date, and write the team-level file
compress
sort team edate
save team_data, replace

* Save in the directory above: copy each finished dataset up one level and
* delete the working copy, then return to the parent directory.
* The destination path uses a forward slash, which Stata accepts on every
* platform. A backslash is a directory separator on Windows only and is also
* the character Stata uses to suppress macro expansion.
foreach f in individual_data team_data {
	copy "`f'.dta" "../`f'.dta", replace
	erase "`f'.dta"
}
cd ..
